This vignette demonstrates how to use the R6 class nppCART implemented in the R package nppR in order to estimate unit-level self-selection propensities for units in a non-probability sample, by incorporating information from a compatible auxiliary probability sample.

The nppCART method is non-parametric and is designed to handle scenarios with possibly strong non-linearities or interactions among the variables involved.

The nppCART methodology was presented at the 2019 Annual Meeting (in Calgary) of the Statistical Society of Canada:

The proceedings can be downloaded from: https://ssc.ca/en/2019-annual-meeting-calgary

The article is also included as a vignette in the nppR package.

Outline of this vignette:

Section 1. Generate the synthetic population data frame.

For reproducibility, we start by setting globally the randomization seed:

# Fix the RNG seed globally so that all sampling below is reproducible.
base::set.seed(7654321);

Next, we load the required R packages.

library(ggplot2);  # plotting
library(nppR);     # nppCART propensity estimation

The following code segment generates data for the synthetic population and stores the data in the data frame DF.population:

# Size of the synthetic population.
population.size <- 10000;

# Candidate cluster centres along each axis.
temp.centres <- c(0.5,1.5,2.5);

# Assign each unit one centre per axis, uniformly at random.
c1 <- sample(x = temp.centres, size = population.size, replace = TRUE);
c2 <- sample(x = temp.centres, size = population.size, replace = TRUE);

# Baseline self-selection propensity around 0.25; units whose two centres
# differ by exactly one step are re-drawn around 0.75, which creates the
# strong interaction between (c1, c2) and the propensity.
true.propensity <- rnorm(n = population.size, mean = 0.25, sd = 0.025);
is.high.propensity <- (abs(c2 - c1) == 1);
true.propensity[is.high.propensity] <- rnorm(n = sum(is.high.propensity), mean = 0.75, sd = 0.025);

# Observed numeric predictors: true centre plus Gaussian noise.
sigma <- 0.20;
x1 <- c1 + rnorm(n = population.size, mean = 0, sd = sigma);
x2 <- c2 + rnorm(n = population.size, mean = 0, sd = sigma);

# Mean response: 110 for high-propensity units, 30 for all others.
y0 <- ifelse(is.high.propensity, 110, 30);

# Target variable: mean response plus a squared standard-normal error term.
epsilon <- rnorm(n = population.size, mean = 0, sd = 1.0);
y <- y0 + epsilon^2;

# Assemble the synthetic population, one row per unit.
DF.population <- data.frame(
    unit.ID         = seq_len(population.size),   # seq_len() is the safe idiom (handles size 0), same values as seq(1, n)
    y               = y,                          # target variable
    x1.numeric      = x1,                         # numeric predictor 1
    x2.numeric      = x2,                         # numeric predictor 2
    true.propensity = true.propensity             # true self-selection propensity
    );

# Discretize each numeric predictor into an ordered three-level (tertile)
# factor, add a jittered numeric version for plotting only, and then drop
# the original numeric column.
for ( colname.numeric in c("x1.numeric","x2.numeric") ) {

    # Tertile cut points: only the 1/3 and 2/3 quantiles are needed
    # (the original code also computed the unused 3/3 quantile, i.e. the maximum).
    temp.quantiles <- quantile(
        x     = DF.population[,colname.numeric],
        probs = c(1,2)/3
        );

    # Logical bin membership; the comparisons already yield logical vectors,
    # so no ifelse(..., TRUE, FALSE) wrapper is needed.
    is.small  <- DF.population[,colname.numeric] <  temp.quantiles[1];
    is.medium <- DF.population[,colname.numeric] >= temp.quantiles[1] & DF.population[,colname.numeric] < temp.quantiles[2];
    is.large  <- DF.population[,colname.numeric] >= temp.quantiles[2];

    # Factor column name: strip the ".numeric" suffix (e.g. "x1.numeric" -> "x1").
    colname.factor <- gsub(x = colname.numeric, pattern = "\\.numeric", replacement = "");
    DF.population[,colname.factor] <- character(nrow(DF.population));

    # English labels for x1, French labels for x2 (purely cosmetic).
    if ( "x1.numeric" == colname.numeric ) {
        DF.population[is.small, colname.factor] <- "small";
        DF.population[is.medium,colname.factor] <- "medium";
        DF.population[is.large, colname.factor] <- "large";
        temp.levels <- c("small","medium","large");
    } else {
        DF.population[is.small, colname.factor] <- "petit";
        DF.population[is.medium,colname.factor] <- "moyen";
        DF.population[is.large, colname.factor] <- "grand";
        temp.levels <- c("petit","moyen","grand");
        }

    # Convert to an ordered factor (small < medium < large).
    DF.population[,colname.factor] <- factor(
        x       = DF.population[,colname.factor],
        levels  = temp.levels,
        ordered = TRUE
        );

    # Jittered numericized version of the factor, for visualization only.
    colname.jitter <- gsub(x = colname.numeric, pattern = "numeric", replacement = "jitter");
    DF.population[,colname.jitter] <- (-0.5) + as.numeric(DF.population[,colname.factor]) + runif(n = nrow(DF.population), min = -0.3, max = 0.3);

    # Drop the original numeric column.
    DF.population <- DF.population[,setdiff(colnames(DF.population),colname.numeric)];

    }

The first few rows of DF.population:

# Display the first six rows of DF.population as a formatted table.
knitr::kable(head(DF.population), align = "c", row.names = FALSE);
unit.ID y true.propensity x1 x1.jitter x2 x2.jitter
1 110.07346 0.7683921 small 0.3992399 moyen 1.2753359
2 30.07629 0.2267301 large 2.6631135 petit 0.7140969
3 110.60617 0.7604559 medium 1.6857998 grand 2.2104801
4 110.09868 0.7326466 medium 1.3638241 grand 2.7652879
5 110.01293 0.7199027 medium 1.3608663 petit 0.3522514
6 111.08307 0.7612071 medium 1.4064630 petit 0.4525272

We remark that y is intended to be the target variable, while x1 and x2 are the predictor variables. nppCART requires only x1 and x2 (and design weights, to be generated later when we select the auxiliary probability sample). The columns x1.jitter and x2.jitter are derived respectively from x1 and x2, purely for visualization purposes (see plots below); in particular, the columns x1.jitter and x2.jitter are not required, and will be ignored, by nppCART.

Here are the structure and summary statistics of DF.population:

# Column types and a preview of values for DF.population.
str(DF.population);
#> 'data.frame':    10000 obs. of  7 variables:
#>  $ unit.ID        : int  1 2 3 4 5 6 7 8 9 10 ...
#>  $ y              : num  110.1 30.1 110.6 110.1 110 ...
#>  $ true.propensity: num  0.768 0.227 0.76 0.733 0.72 ...
#>  $ x1             : Ord.factor w/ 3 levels "small"<"medium"<..: 1 3 2 2 2 2 1 3 3 1 ...
#>  $ x1.jitter      : num  0.399 2.663 1.686 1.364 1.361 ...
#>  $ x2             : Ord.factor w/ 3 levels "petit"<"moyen"<..: 2 1 3 3 1 1 3 1 1 2 ...
#>  $ x2.jitter      : num  1.275 0.714 2.21 2.765 0.352 ...
# Per-column summary statistics for DF.population.
summary(DF.population);
#>     unit.ID            y          true.propensity       x1      
#>  Min.   :    1   Min.   : 30.00   Min.   :0.1633   small :3333  
#>  1st Qu.: 2501   1st Qu.: 30.35   1st Qu.:0.2468   medium:3333  
#>  Median : 5000   Median : 32.61   Median :0.2835   large :3334  
#>  Mean   : 5000   Mean   : 66.74   Mean   :0.4738                
#>  3rd Qu.: 7500   3rd Qu.:110.34   3rd Qu.:0.7468                
#>  Max.   :10000   Max.   :125.25   Max.   :0.8542                
#>    x1.jitter          x2         x2.jitter     
#>  Min.   :0.2002   petit:3333   Min.   :0.2002  
#>  1st Qu.:0.6551   moyen:3333   1st Qu.:0.6480  
#>  Median :1.4956   grand:3334   Median :1.5016  
#>  Mean   :1.5008                Mean   :1.5029  
#>  3rd Qu.:2.3577                3rd Qu.:2.3522  
#>  Max.   :2.8000                Max.   :2.7999

The following plot illustrates the strong interaction between the true propensity and the predictor variables x1 and x2 in DF.population:

# Font sizes shared by all plots in this vignette.
textsize.title <- 30;
textsize.axis  <- 20;

# Population plot: jittered (x1, x2) positions, coloured by the true
# self-selection propensity. Built as a single chained pipeline; the
# layers are added in the same order as before.
my.ggplot <- ggplot(data = NULL) +
    theme_bw() +
    theme(
        title            = element_text(size = textsize.title, face = "bold"),
        axis.text.x      = element_text(size = textsize.axis,  face = "bold", angle = 0, vjust = 0.5, hjust = 0.5),
        axis.text.y      = element_text(size = textsize.axis,  face = "bold"),
        axis.title.x     = element_text(size = textsize.axis,  face = "bold"),
        axis.title.y     = element_text(size = textsize.axis,  face = "bold"),
        legend.title     = element_text(size = textsize.axis,  face = "bold"),
        legend.text      = element_text(size = textsize.axis,  face = "bold"),
        panel.grid.major = element_line(colour = "gray", linetype = 2, size = 0.25),
        panel.grid.minor = element_line(colour = "gray", linetype = 2, size = 0.25),
        legend.position  = "bottom",
        legend.key.width = ggplot2::unit(0.75,"in")
        ) +
    labs(
        title    = NULL,
        subtitle = "Population",
        colour   = "true propensity       "
        ) +
    geom_hline(yintercept = 0, colour = "gray", size = 0.75) +
    geom_vline(xintercept = 0, colour = "gray", size = 0.75) +
    scale_x_continuous(
        limits = c(0,3),
        breaks = c(0.5,1.5,2.5),
        labels = levels(DF.population[,'x1'])
        ) +
    scale_y_continuous(
        limits = c(0,3),
        breaks = c(0.5,1.5,2.5),
        labels = levels(DF.population[,'x2'])
        ) +
    xlab("x1 (jittered)") +
    ylab("x2 (jittered)") +
    scale_colour_gradient(
        limits = c(0,1),
        breaks = c(0,0.25,0.5,0.75,1),
        low    = "black",
        high   = "red"
        ) +
    geom_point(
        data    = DF.population,
        mapping = aes(x = x1.jitter, y = x2.jitter, colour = true.propensity),
        alpha   = 0.2
        );

# Render the plot.
my.ggplot;

Note that the predictor variables we will use below are the ordinal (in particular, categorical) variables x1 and x2. In the above plot, we used however jittered numericized versions of these two variables instead for easy visualization.

Note also that the colour gradient in the above plot indicates the true (though synthetic) values of the self-selection propensities in this example. We will assess the efficacy of nppCART, by generating the counterpart of the above plot for the estimated propensity values produced by nppCART; see Appendix A.

Section 2. Generate data frames for the non-probability and probability samples.

We now form the non-probability sample, a Poisson sample from the synthetic population each of whose units is selected – independently from all other units – according to its own unit-specific (true) self-selection propensity. We store data for the non-probability sample in the data frame DF.non.probability.

# Form the non-probability sample: each population unit self-selects,
# independently of all other units, with its own true propensity.
DF.non.probability <- DF.population;
# vapply() (rather than sapply()) guarantees a logical return vector;
# the per-unit sample() calls -- and hence the RNG stream and the
# resulting selections -- are unchanged.
DF.non.probability[,"self.selected"] <- vapply(
    X         = DF.non.probability[,"true.propensity"],
    FUN       = function(x) { sample(x = c(FALSE,TRUE), size = 1, prob = c(1-x,x)) },
    FUN.VALUE = logical(1)
    );
# Keep only the self-selected units, dropping the helper columns.
DF.non.probability <- DF.non.probability[DF.non.probability[,"self.selected"],c("unit.ID","y","x1","x2","x1.jitter","x2.jitter")];

The first few rows of DF.non.probability:

# Display the first six rows of DF.non.probability as a formatted table.
knitr::kable(head(DF.non.probability), align = "c", row.names = FALSE);
unit.ID y x1 x2 x1.jitter x2.jitter
1 110.07346 small moyen 0.3992399 1.2753359
3 110.60617 medium grand 1.6857998 2.2104801
4 110.09868 medium grand 1.3638241 2.7652879
6 111.08307 medium petit 1.4064630 0.4525272
9 30.15207 large petit 2.3379334 0.5833871
10 110.57663 small moyen 0.3004878 1.3386584

Next, we select the auxiliary probability sample, which is an SRSWOR from the synthetic population, with design selection probability 0.1. We store data for the auxiliary probability sample in the data frame DF.probability.

# Design selection probability for the auxiliary probability sample.
prob.selection <- 0.1;

# Per-unit Bernoulli selection indicator over the whole population.
selection.indicator <- sample(
    x       = c(TRUE,FALSE),
    size    = nrow(DF.population),
    replace = TRUE,
    prob    = c(prob.selection, 1 - prob.selection)
    );

# The probability sample carries only the predictors plus the design
# weight (the inverse of the selection probability).
DF.probability <- DF.population[selection.indicator,c("unit.ID","x1","x2")];
DF.probability[,"design.weight"] <- 1 / prob.selection;

The first few rows of DF.probability:

# Display the first six rows of DF.probability as a formatted table.
knitr::kable(head(DF.probability), align = "c", row.names = FALSE);
unit.ID x1 x2 design.weight
5 medium petit 10
7 small grand 10
23 large moyen 10
30 large moyen 10
33 medium moyen 10
35 medium moyen 10

Section 3. Compute estimated propensities via nppCART.

Now that we have the two input data sets ready (i.e., the data frames DF.non.probability and DF.probability), we are ready to use nppCART to estimate the self-selection propensities for the units in the non-probability sample, using the auxiliary information furnished by the probability sample.

We start by instantiating an nppCART object, with the two input data sets as follows:

# Instantiate the nppCART object:
#   np.data         -- the non-probability sample (units whose propensities we estimate)
#   p.data          -- the auxiliary probability sample
#   predictors      -- predictor columns common to both input data frames
#   sampling.weight -- column of p.data holding the design weights
my.nppCART <- nppR::nppCART(
    np.data         = DF.non.probability,
    p.data          = DF.probability,
    predictors      = c("x1","x2"),
    sampling.weight = "design.weight"
    );

Next, we call the nppCART$grow( ) method to grow the classification tree according to the algorithm nppCART.

# Grow the classification tree.
# NOTE(review): the warnings emitted below appear to originate from nodes
# where no candidate split yields a finite impurity reduction (max() over
# an empty set returns -Inf) -- seemingly harmless here, but confirm
# against the nppR internals.
my.nppCART$grow();
#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

#> Warning in base::max(base::unique(base::as.double(impurity.reductions[!
#> base::is.na(impurity.reductions)]))): no non-missing arguments to max; returning
#> -Inf

Once the tree growing is complete, we can examine the fully grown tree with the nppCART$print( ) method:

# Print the fully grown tree; FUN.format rounds displayed numbers to 3 digits.
my.nppCART$print( FUN.format = function(x) {return(round(x,digits=3))} );
#> 
#> (0) [root], impurity = 0.69, np.count = 4703, p.count = 1028
#>   (1) [x1 < 1.5], impurity = 0.665, np.count = 1379, p.count = 361
#>     (3) [x2 < 2.5], impurity = 0.692, np.count = 1086, p.count = 228
#>       (5) [x2 < 1.5], impurity = 0.541, np.count = 264, p.count = 114
#>       (6) [x2 >= 1.5], impurity = 0.592, np.count = 822, p.count = 114
#>     (4) [x2 >= 2.5], impurity = 0.527, np.count = 293, p.count = 133
#>   (2) [x1 >= 1.5], impurity = 0.693, np.count = 3324, p.count = 667
#>     (7) [x1 < 2.5], impurity = 0.687, np.count = 1937, p.count = 348
#>       (9) [x2 < 2.5], impurity = 0.692, np.count = 1090, p.count = 227
#>         (11) [x2 < 1.5], impurity = 0.617, np.count = 804, p.count = 116
#>         (12) [x2 >= 1.5], impurity = 0.571, np.count = 286, p.count = 111
#>       (10) [x2 >= 2.5], impurity = 0.611, np.count = 847, p.count = 121
#>     (8) [x1 >= 2.5], impurity = 0.685, np.count = 1387, p.count = 319
#>       (13) [x2 < 2.5], impurity = 0.692, np.count = 1108, p.count = 212
#>         (15) [x2 < 1.5], impurity = 0.586, np.count = 273, p.count = 100
#>         (16) [x2 >= 1.5], impurity = 0.567, np.count = 835, p.count = 112
#>       (14) [x2 >= 2.5], impurity = 0.574, np.count = 279, p.count = 107

We next extract the estimated propensities, as a data frame, using the nppCART$get_npdata_with_propensity( ) method:

# Extract the non-probability sample augmented with terminal-node-level
# estimated propensities (and related node diagnostics) as a data frame.
DF.npdata.estimated.propensity <- my.nppCART$get_npdata_with_propensity();

Here are the first few rows of the data frame returned by nppCART$get_npdata_with_propensity( ):

# Display the first six rows of the augmented data frame as a formatted table.
knitr::kable(head(DF.npdata.estimated.propensity), align = "c", row.names = FALSE);
unit.ID y x1 x2 x1.jitter x2.jitter nodeID propensity np.count p.weight impurity
1 110.07346 small moyen 0.3992399 1.2753359 6 0.7210526 822 1140 0.5919564
3 110.60617 medium grand 1.6857998 2.2104801 10 0.7000000 847 1210 0.6108643
4 110.09868 medium grand 1.3638241 2.7652879 10 0.7000000 847 1210 0.6108643
6 111.08307 medium petit 1.4064630 0.4525272 11 0.6931034 804 1160 0.6165950
9 30.15207 large petit 2.3379334 0.5833871 15 0.2730000 273 1000 0.5862199
10 110.57663 small moyen 0.3004878 1.3386584 6 0.7210526 822 1140 0.5919564

Note that the returned data frame DF.npdata.estimated.propensity is in fact the original input data frame DF.non.probability (passed in to nppCART via the input parameter np.data) with a few additional columns augmented at the right. Please refer to help pages of the package for full details.

In particular, the desired unit-specific (more precisely, specific to terminal nodes of the fully grown tree) estimated self-selection propensities are under the propensity column.

Appendix A. Assessment of results on the synthetic data sets above.

We make direct comparison between the nppCART-estimated self-selection propensities against their respective (true but synthetic) values. Obviously, this type of accuracy assessment is generally not possible in practice, since the true values of the self-selection propensities are either unknown, or if they were known somehow, then there would be no need to estimate them.

First, we attach the true propensity values to the output data frame DF.npdata.estimated.propensity of nppCART$get_npdata_with_propensity( ), renaming the column propensity to estimated.propensity for clarity:

# Rename the "propensity" column to "estimated.propensity" (exact-name
# replacement, equivalent to gsub() with the anchored pattern "^propensity$").
names(DF.npdata.estimated.propensity)[names(DF.npdata.estimated.propensity) == "propensity"] <- "estimated.propensity";

# Attach the true propensities by joining on unit.ID.
DF.npdata.estimated.propensity <- merge(
    x  = DF.npdata.estimated.propensity,
    y  = DF.population[,c("unit.ID","true.propensity")],
    by = "unit.ID"
    );

# Restore ascending unit.ID order after the merge.
DF.npdata.estimated.propensity <- DF.npdata.estimated.propensity[order(DF.npdata.estimated.propensity[,"unit.ID"]),];

Here are the first few rows of the resulting data frame:

# Display the first six rows of the merged data frame as a formatted table.
knitr::kable(head(DF.npdata.estimated.propensity), align = "c", row.names = FALSE);
unit.ID y x1 x2 x1.jitter x2.jitter nodeID estimated.propensity np.count p.weight impurity true.propensity
1 110.07346 small moyen 0.3992399 1.2753359 6 0.7210526 822 1140 0.5919564 0.7683921
3 110.60617 medium grand 1.6857998 2.2104801 10 0.7000000 847 1210 0.6108643 0.7604559
4 110.09868 medium grand 1.3638241 2.7652879 10 0.7000000 847 1210 0.6108643 0.7326466
6 111.08307 medium petit 1.4064630 0.4525272 11 0.6931034 804 1160 0.6165950 0.7612071
9 30.15207 large petit 2.3379334 0.5833871 15 0.2730000 273 1000 0.5862199 0.2414681
10 110.57663 small moyen 0.3004878 1.3386584 6 0.7210526 822 1140 0.5919564 0.7381769

Next, we generate the estimated propensity plot, as promised in Section 1; this plot is the counterpart to the one in Section 1 for the nppCART-estimated propensities:

# Counterpart of the Section 1 plot: jittered (x1, x2) positions for the
# non-probability sample, coloured by the nppCART-estimated propensity.
# Built as a single chained pipeline; layers are added in the same order
# as before.
my.ggplot <- ggplot(data = NULL) +
    theme_bw() +
    theme(
        title            = element_text(size = textsize.title, face = "bold"),
        axis.text.x      = element_text(size = textsize.axis,  face = "bold", angle = 0, vjust = 0.5, hjust = 0.5),
        axis.text.y      = element_text(size = textsize.axis,  face = "bold"),
        axis.title.x     = element_text(size = textsize.axis,  face = "bold"),
        axis.title.y     = element_text(size = textsize.axis,  face = "bold"),
        legend.title     = element_text(size = textsize.axis,  face = "bold"),
        legend.text      = element_text(size = textsize.axis,  face = "bold"),
        panel.grid.major = element_line(colour = "gray", linetype = 2, size = 0.25),
        panel.grid.minor = element_line(colour = "gray", linetype = 2, size = 0.25),
        legend.position  = "bottom",
        legend.key.width = ggplot2::unit(0.75,"in")
        ) +
    labs(
        title    = NULL,
        subtitle = "Non-probability sample",
        colour   = "estimated propensity       "
        ) +
    geom_hline(yintercept = 0, colour = "gray", size = 0.75) +
    geom_vline(xintercept = 0, colour = "gray", size = 0.75) +
    scale_x_continuous(
        limits = c(0,3),
        breaks = c(0.5,1.5,2.5),
        labels = levels(DF.population[,'x1'])
        ) +
    scale_y_continuous(
        limits = c(0,3),
        breaks = c(0.5,1.5,2.5),
        labels = levels(DF.population[,'x2'])
        ) +
    xlab("x1 (jittered)") +
    ylab("x2 (jittered)") +
    scale_colour_gradient(
        limits = c(0,1),
        breaks = c(0,0.25,0.5,0.75,1),
        low    = "black",
        high   = "red"
        ) +
    geom_point(
        data    = DF.npdata.estimated.propensity,
        mapping = aes(x = x1.jitter, y = x2.jitter, colour = estimated.propensity),
        alpha   = 0.2
        );

# Render the plot.
my.ggplot;

Compare the plot immediately above with its counterpart in Section 1, and note that nppCART is able to re-construct approximately the interaction patterns between self-selection propensity and (x1, x2).

Lastly, we directly compare the true and nppCART-estimated values of the self-selection propensities for units in the non-probability sample via a hexbin plot (an alternative to the scatter plot for large data sets):

# Hexbin plot of true versus estimated propensities for the units of the
# non-probability sample. Built as a single chained pipeline; layers are
# added in the same order as before.
my.ggplot <- ggplot(data = NULL) +
    theme_bw() +
    theme(
        title            = element_text(size = textsize.title, face = "bold"),
        axis.text.x      = element_text(size = textsize.axis,  face = "bold"),
        axis.text.y      = element_text(size = textsize.axis,  face = "bold"),
        axis.title.x     = element_text(size = textsize.axis,  face = "bold"),
        axis.title.y     = element_text(size = textsize.axis,  face = "bold"),
        legend.title     = element_text(size = textsize.axis,  face = "bold"),
        legend.text      = element_text(size = textsize.axis,  face = "bold"),
        panel.grid.major = element_line(colour = "gray", linetype = 2, size = 0.25),
        panel.grid.minor = element_line(colour = "gray", linetype = 2, size = 0.25),
        legend.position  = "bottom",
        legend.key.width = ggplot2::unit(1.0,"in")
        ) +
    labs(
        title    = NULL,
        subtitle = "Non-probability sample"
        ) +
    xlab("true propensity") +
    ylab("estimated propensity") +
    geom_hline(yintercept = 0, colour = "gray", size = 0.75) +
    geom_vline(xintercept = 0, colour = "gray", size = 0.75) +
    scale_x_continuous(limits = c(0,1), breaks = seq(0,1,0.2)) +
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.2)) +
    scale_fill_gradient(
        limits = 260 * c(  0,1),
        breaks = 260 * seq(0,1,0.25),
        low    = "lightgrey",
        high   = "red"
        ) +
    geom_hex(
        data     = DF.npdata.estimated.propensity,
        mapping  = aes(x = true.propensity, y = estimated.propensity),
        binwidth = c(0.02,0.02)
        );

# Render the plot.
my.ggplot;